# Identify values causing the issueproblematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]#> Warning: NAs introduced by coercion# Replace non-numeric values with NAproperties$number_of_rooms <-as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))# Remove non-numeric characters and convert to numericproperties$price <-as.numeric(gsub("[^0-9]", "", properties$price))# Subset the dataset to exclude rows with price < 20000properties <- properties[properties$price >=20000, ]# Subset the dataset to exclude rows with numbers of rooms < 25properties <- properties[properties$number_of_rooms <25, ]# Replace incomplete addressesproperties$address <-gsub("^\\W*[.,0-]\\W*", "", properties$address)properties_filtered <-na.omit(properties)properties_filtered$year_category <-substr(properties_filtered$year_category, 1, 9)# Assuming 'year_category' is a column in the 'properties' datasetproperties_filtered$year_category <-as.factor(properties_filtered$year_category)# Preprocess the number_of_rooms columnproperties_filtered$number_of_rooms <-as.character(properties_filtered$number_of_rooms)properties_filtered$number_of_rooms <-gsub("\\D", "", properties_filtered$number_of_rooms) # Remove non-numeric charactersproperties_filtered$number_of_rooms <-as.numeric(properties_filtered$number_of_rooms) # Convert to numericproperties_filtered$number_of_rooms <-trunc(properties_filtered$number_of_rooms) # Truncate non-integer values# remove m^2 from column 'square_meters'properties_filtered$square_meters <-as.numeric(gsub("\\D", "", properties_filtered$square_meters))# print how many NA observations left in square_metersprint(sum(is.na(properties_filtered$square_meters)))#> [1] 988# remove NAproperties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]# add majuscule to cantonproperties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)# show 100 first row of cleaned dataset using reactablereactable(head(properties_filtered, 100))
2.1.3 AMTOVZ_CSV_LV95 Data
ajouter source
ajouter description
expliquer blabla
2.1.3.1 Creating Variable zip_code and merging with AMTOVZ_CSV_LV95
Code
df <- properties_filtered#the address column is like : '1844 Villeneuve VD' and has zip code number in it#taking out the zip code number and creating a new column 'zip_code'#the way to identify the zip code is to identify numbers that are 4 digits longdf$zip_code <-as.numeric(gsub("\\D", "", df$address))#removing the first two number of zip code has more than 4 numberdf$zip_code <-ifelse(df$zip_code >9999, df$zip_code %%10000, df$zip_code)
2.1.3.2 Using AMTOVZ_CSV_LV95 to get the city and canton from the zip code
Code
#read .csv AMTOVZ_CSV_LV95amto <-read.csv(file.path(here(),"data/AMTOVZ_CSV_LV95.csv"), sep =";")#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code' and 'KantonskÃ.rzel' as 'Canton_code'amto_df <-data.frame(City=amto$Ortschaftsname, zip_code=amto$PLZ, Canton_code=amto$Kantonskürzel)# display 100 first rows of atmo_df using reactablereactable::reactable(head(amto_df, 1000))
The zip code was incorectly isolated from the address
Removed them ::: {.cell layout-align=“center”}
Code
#remove the rows with nan in cityproperties_filtered <- df[!is.na(df$City),]#show the first 100 rows of the cleaned dataset using reactablereactable(head(properties_filtered, 100))
:::
2.1.4 Tax data
ajouter source
ajouter description
expliquer blabla
2.1.4.1 Cleaning
Code
# read csvimpots <-read.csv(file.path(here(),"data/estv_income_rates.csv"), sep =",", header =TRUE, stringsAsFactors =FALSE)# Remove 1st rowimpots <- impots[-1, ]# Remove 3rd columnimpots <- impots[, -3]# Combine text for columns 4-8impots[1, 4:8] <-"Impôt sur le revenu"# Combine text for columns 9-13impots[1, 9:13] <-"Impôt sur la fortune"# Combine text for columns 14-16impots[1, 14:16] <-"Impôt sur le bénéfice"# Combine text for columns 17-19impots[1, 17:19] <-"Impôt sur le capital"# Combine content of the first 2 rows into the 2nd rowimpots[2, ] <-apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep =" ")))))# Remove 1st rowimpots <- impots[-1, ]# Assign the text to the 1st row and 1st columnimpots[1, 1] <-"Coefficient d'impôt en %"# Replace column names with the content of the first rowcolnames(impots) <- impots[1, ]impots <- impots[-1, ]# Check for missing values in impotsany_missing <-any(is.na(impots))if (any_missing) {print("There are missing values in impots.")} else {print("There are no missing values in impots.")}#> [1] "There are no missing values in impots."# Replace row names with the content of the 3rd columnrow.names(impots) <- impots[, 3]impots <- impots[, -3]# Remove 2nd column (to avoid canton column)impots <- impots[, -2]# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))# Replace NA values with 0cleaned_impots[is.na(cleaned_impots)] <-0# Check for non-numeric valuesnon_numeric <-sum(!is.na(cleaned_impots) &!is.numeric(cleaned_impots))if (non_numeric >0) {print(paste("Warning: Found", non_numeric, "non-numeric values."))}# Perform clustering or any other analysis with cleaned_impotsreactable(head(cleaned_impots, 100))
2.1.4.2 Merging the two datasets
Code
library(dendextend)# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))cleaned_impots[is.na(cleaned_impots)] <-0# Replace NA values with 0# Scale the featuresscaled_impots <-scale(cleaned_impots)# Calculate distance matrixdist_matrix <-dist(scaled_impots, method ="euclidean")# Perform hierarchical clusteringhclust_model <-hclust(dist_matrix, method ="ward.D2")# Create dendrogramdend <-as.dendrogram(hclust_model)# Zoom range for the dendrogramy_zoom_range <-c(0, 10)# Adjust font size for better visualizationpar(cex =0.6)# Plot dendrogramplot(dend, main ="Hierarchical Clustering Dendrogram", horiz =FALSE, ylim = y_zoom_range) # Set horiz to FALSE for vertical dendrogram# Cut the dendrogram to obtain clustersclusters <-cutree(hclust_model, k =5) # Adjust the number of clusters as needed# Color branches according to clusterscolored_dend <-color_branches(dend, k =5)plot(colored_dend, main ="Hierarchical Clustering Dendrogram", horiz =FALSE, ylim = y_zoom_range)
2.1.5 Commune Data
2.1.5.1 Cleaning
ajouter source
ajouter description
expliquer blabla
Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
Code
# il faudra changer le pathcommune_prep <-read.csv(file.path(here(),"data/commune_data.csv"), sep =";", header =TRUE, stringsAsFactors =FALSE)# We keep only 2019 to have some reference? (2020 is apparently not really complete)commune_2019 <-subset(commune_prep, PERIOD_REF =="2019") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonnecommune_2019 <-subset(commune_2019, STATUS =="A") %>%select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# on enlève les lignes qui sont des aggrégatscommune_2019 <-subset(commune_2019, REGION !="Schweiz")commune_2019 <- commune_2019 %>%pivot_wider(names_from = INDICATORS, values_from = VALUE)# Rename columns using the provided mapdf_commune <- commune_2019 %>%rename(`Population - Habitants`= Ind_01_01,`Population - Densité de la population`= Ind_01_03,`Population - Etrangers`= Ind_01_08,`Population - Part du groupe d'âge 0-19 ans`= Ind_01_04,`Population - Part du groupe d'âge 20-64 ans`= Ind_01_05,`Population - Part du groupe d'âge 65+ ans`= Ind_01_06,`Population - Taux brut de nuptialité`= Ind_01_09,`Population - Taux brut de divortialité`= Ind_01_10,`Population - Taux brut de natalité`= Ind_01_11,`Population - Taux brut de mortalité`= Ind_01_12,`Population - Ménages privés`= Ind_01_13,`Population - Taille moyenne des ménages`= Ind_01_14,`Sécurité sociale - Taux d'aide sociale`= Ind_11_01,`Conseil national - PLR`= Ind_14_01,`Conseil national - PDC`= Ind_14_02,`Conseil national - PS`= Ind_14_03,`Conseil national - UDC`= Ind_14_04,`Conseil national - PEV/PCS`= Ind_14_05,`Conseil national - PVL`= Ind_14_06,`Conseil national - PBD`= Ind_14_07,`Conseil national - PST/Sol.`= Ind_14_08,`Conseil national - PES`= Ind_14_09,`Conseil national - Petits partis de droite`= Ind_14_10)# If no one voted for a party, set as NA -> replacing it with 0 insteaddf_commune <- df_commune %>%mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# Removing NAs from Taux de couverture sociale column# Setting the mean as the mean for Switzerland in 2019 (3.2%)mean_taux_aide_social <-3.2# Replace NA values with the meandf_commune <- df_commune %>%mutate(`Sécurité sociale - Taux d'aide sociale`=if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#show 100 first rows of df_commune using reactablereactable(head(df_commune, 100))
Code
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)# # # We keep only 2019 to have some reference? (2020 is apparently not really complete)# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# # # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne# commune_2019 <- subset(commune_2019, STATUS == "A") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# # # on enlève les lignes qui sont des aggrégats# commune_2019 <- subset(commune_2019, REGION != "Schweiz")# # commune_2019 <- commune_2019 %>%# pivot_wider(names_from = INDICATORS, values_from = VALUE)# # # Rename columns using the provided map# df_commune <- commune_2019 %>%# rename(`Population - Habitants` = Ind_01_01,# `Population - Densité de la population` = Ind_01_03,# `Population - Etrangers` = Ind_01_08,# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,# `Population - Taux brut de nuptialité` = Ind_01_09,# `Population - Taux brut de divortialité` = Ind_01_10,# `Population - Taux brut de natalité` = Ind_01_11,# `Population - Taux brut de mortalité` = Ind_01_12,# `Population - Ménages privés` = Ind_01_13,# `Population - Taille moyenne des ménages` = Ind_01_14,# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,# `Conseil national - PLR` = Ind_14_01,# `Conseil national - PDC` = Ind_14_02,# `Conseil national - PS` = Ind_14_03,# `Conseil national - UDC` = Ind_14_04,# `Conseil national - PEV/PCS` = Ind_14_05,# `Conseil national - PVL` = Ind_14_06,# `Conseil national - PBD` = Ind_14_07,# `Conseil national - PST/Sol.` = Ind_14_08,# `Conseil national - PES` = Ind_14_09,# `Conseil national - Petits partis de droite` = Ind_14_10)# # # If no one voted for a party, set as NA -> replacing it with 0 instead# df_commune <- df_commune %>%# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# # # # Removing NAs from Taux de couverture sociale column# # Setting the mean as the mean for Switzerland in 2019 (3.2%)# mean_taux_aide_social <- 3.2# # # Replace NA values with the mean# df_commune <- df_commune %>%# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#
3 EDA
3.1 Histogram of prices
Code
histogram_price <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="red") +labs(title ="Distribution of Prices",x ="Price",y ="Frequency") +theme_minimal()# Convert ggplot object to plotly objectinteractive_histogram_price <-ggplotly(histogram_price, width =600, height =400 )# Display the interactive histograminteractive_histogram_price
3.2 Histogram of prices for each property type
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create the ggplot objecthistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ property_type, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Property Type",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram
3.3 Histogram of prices for each year category
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create a histogram of prices for each year categoryhistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ year_category, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Year Category",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram_year <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram_year
3.4 Histogram of prices for each canton
note : only price between 0 and 500000 so some outliers aren’t here
Code
histogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ canton, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Canton",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000) %>%layout(height =1000) # Adjust the height as needed# Display the interactive plotinteractive_histogram
3.5 Histogram of prices for each number of rooms
note : only price between 0 and 500000 so some outliers aren’t here
and the graph below only show apartments with less than 10 rooms (but you can change the code if needed
Code
properties_room <- properties_filtered[properties_filtered$number_of_rooms <20, ] # Filter only number_of_rooms less than 20# Create a histogram of prices for each number of roomshistogram <-ggplot(properties_room, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ number_of_rooms, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Number of Rooms",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram, width =600, height =1000)# Display the interactive plotinteractive_histogram
3.6 Histogram of prices with impot
Code
# colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)] <- gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)])# # # Create a scatter plot to visualize correlation between price and Impôt cantonal# scatter_plot <- ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +# geom_point() +# labs(title = "Correlation between Price and Impôt cantonal",# x = "Price",# y = "Impôt cantonal") +# theme_minimal()# # # Convert ggplot object to plotly object# interactive_plot <- ggplotly(scatter_plot)# # # Display the interactive plot# interactive_plot
impot_cols <-names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]# Count the number of NA values in selected columnsna_counts <-colSums(is.na(properties_filtered[impot_cols]))# Print the countsprint(na_counts)#> numeric(0)
4 Supervised learning
Data splitting (if a training/test set split is enough for the global analysis, at least one CV or bootstrap must be used)
Two or more models
Two or more scores
Tuning of one or more hyperparameters per model
Interpretation of the model(s)
5 Unsupervised learning
Clustering and/or dimension reduction
Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities
A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?
Code
# faudra changer les colonnes "select" si on sépare demographique et politiquecols_cluster <-select(df_commune, -REGION, -CODE_REGION)# A reflechir, scale ou pas scale? pour l'instant je scalecols_cluster <-scale(cols_cluster)dist_matrix <-dist(cols_cluster, method ="euclidean")hclust <-hclust(dist_matrix, method ="centroid")dend <-as.dendrogram(hclust)dend <-color_branches(dend, k =26) # Adjust the number of clusters as neededy_zoom_range <-c(0, 10)par(cex =0.6) # Adjust the font size for better visualizationplot(dend, main ="Hierarchical Clustering Dendrogram", horiz =FALSE,ylim = y_zoom_range) # Set horiz to FALSE for vertical dendrogram# Cut the dendrogram to obtain clustersclusters <-cutree(hclust_model, k =5) # Adjust the number of clusters as needed
Code
library(dendextend)# Clean data and convert to numericcleaned_impots <-apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))cleaned_impots[is.na(cleaned_impots)] <-0# Replace NA values with 0# Scale the featuresscaled_impots <-scale(cleaned_impots)# Calculate distance matrixdist_matrix <-dist(scaled_impots, method ="euclidean")# Perform hierarchical clusteringhclust_model <-hclust(dist_matrix, method ="ward.D2")# Create dendrogramdend <-as.dendrogram(hclust_model)# Zoom range for the dendrogramy_zoom_range <-c(0, 10)# Adjust font size for better visualizationpar(cex =0.6)# Plot dendrogramplot(dend, main ="Hierarchical Clustering Dendrogram", horiz =FALSE, ylim = y_zoom_range) # Set horiz to FALSE for vertical dendrogram# Cut the dendrogram to obtain clustersclusters <-cutree(hclust_model, k =5) # Adjust the number of clusters as needed# Color branches according to clusterscolored_dend <-color_branches(dend, k =5)plot(colored_dend, main ="Hierarchical Clustering Dendrogram", horiz =FALSE, ylim = y_zoom_range)